import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os
import glob
import numpy as np
import datetime as dt
# Fix the RNG seed so later df.sample() calls are reproducible.
np.random.seed(42)

# Combine every monthly trip-data CSV under Data/ into one dataframe.
# glob.glob already returns a list, so no comprehension is needed.
extension = "csv"
all_files = glob.glob("Data/*.{}".format(extension))
df = pd.concat([pd.read_csv(file) for file in all_files])
df.to_csv("combined_tripdata.csv", index=False)
print(df.shape)
df.head()

# Reload from the combined file so the notebook can be restarted from here.
df = pd.read_csv("combined_tripdata.csv")
df.head()

# Keep an untouched copy before any cleaning steps.
df_copy = df.copy()
# rental_access_method is not used in this analysis, drop it up front.
df.drop(labels="rental_access_method", axis=1, inplace=True)
from pandas_profiling import ProfileReport

# Generate an exploratory profiling report for the combined trip data.
profile = ProfileReport(df, title="combined_tripdata.csv", explorative=True)
# Render the interactive widget view once (the original called it twice,
# which rendered the same report a second time for no benefit).
profile.to_widgets()
profile.to_notebook_iframe()
profile.to_file("report.html")
df.columns
df.shape
# show_counts replaces the null_counts keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0.
df.info(verbose=True, show_counts=True)
pd.set_option('float_format', '{:f}'.format)
df.describe()

# Report what fraction of each column is null, to judge whether simply
# dropping the null rows is acceptable. (The enumerate index in the
# original loop was unused.)
for name in df.columns:
    null_count = df[name].isnull().sum()
    percentage = (null_count / df.shape[0]) * 100
    print("Percentage of null: {0:.2f}% for column:{1}".format(percentage, name))
df.isnull().sum()
Since the null values don't represent a large percentage of the data set, dropping them will not affect the analysis results.
# code.
# The null share is small (see the percentages printed above), so dropping
# the affected rows should not bias the analysis.
df = df.dropna()
#test: no nulls should remain
df.isnull().sum()
print(df.shape)
# number of rows removed = row count before cleaning - row count after
print(2506983 - 2262979)
Change the start and end time columns to datetime objects.
#code
# Parse the start/end timestamps into datetime64 so .dt accessors work later.
df["start_time"] = pd.to_datetime(df["start_time"])
df["end_time"] = pd.to_datetime(df["end_time"])
#test
# show_counts replaces the null_counts keyword removed in pandas 2.0.
df.info(verbose=True, show_counts=True)
df.bike_share_for_all_trip.value_counts()
Change the values Yes/No to boolean True and False.
#code
# Map Yes/No to booleans in the one column that holds them. The original
# frame-wide df.replace(...) would also have rewritten any "Yes"/"No"
# values appearing in other columns (e.g. inside station names).
df["bike_share_for_all_trip"] = df["bike_share_for_all_trip"].replace(
    {"Yes": True, "No": False})
#test
df.bike_share_for_all_trip.value_counts()
df.duration_sec.value_counts()[:10]
df.duration_sec[:10]
Convert the duration from seconds to minutes and rename the column to 'duration_min'.
# setting duration to minutes and change columns name
#code
# Convert trip duration from seconds to minutes and rename the column.
df["duration_sec"] = df["duration_sec"] / 60
df = df.rename(columns={"duration_sec": "duration_min"})
#test
df.duplicated().sum()
df.user_type.value_counts()
df.user_type.value_counts()
df.start_station_name.value_counts()[:10]
df.end_station_name.value_counts()[:10]
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Great-circle distance between two points on the earth.

    Coordinates may be given in decimal degrees (to_radians=True, the
    default) or already in radians. All (lat, lon) inputs must be numeric
    and of equal length (scalars or array-likes); the result is expressed
    in the units of earth_radius (kilometres by default).
    """
    if to_radians:
        lat1, lon1, lat2, lon2 = np.radians([lat1, lon1, lat2, lon2])
    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0
    # Haversine term: sin^2(dlat/2) + cos(lat1)*cos(lat2)*sin^2(dlon/2)
    hav = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
    return 2 * earth_radius * np.arcsin(np.sqrt(hav))
Haversine Formula – Calculate geographic distance on earth. If you have two different latitude – longitude values of two different point on earth, then with the help of Haversine Formula, you can easily compute the great-circle distance (The shortest distance between two points on the surface of a Sphere). The term Haversine was coined by Prof. James Inman in 1835. Haversine is very popular and frequently used formula when developing a GIS (Geographic Information System) application or analyzing path and fields.
I want to have the distance between the start and end locations, so the coordinates below are of a location that I know, to test the formula.
# A known pair of Bay Area locations used to sanity-check the formula.
lat1, lon1 = 37.775745, -122.213037
lat2, lon2 = 37.808894, -122.25646
haversine(lat1, lon1, lat2, lon2)
from IPython.display import Image
Image(filename="img.png")
Looks great — this is the first start and end point of our data set before cleaning, and now we can apply the function to it.
# Show the four coordinate columns for the first trip in the data set.
coord_cols = ["start_station_latitude", "start_station_longitude",
              "end_station_latitude", "end_station_longitude"]
df[coord_cols].head(1)
Add a distance column by applying the haversine function to the coordinate columns.
# code: haversine is fully vectorized with numpy, so we can pass the
# coordinate columns directly instead of the slow row-wise df.apply the
# original used — identical values, orders of magnitude faster.
df["distance_km"] = haversine(df["start_station_latitude"],
                              df["start_station_longitude"],
                              df["end_station_latitude"],
                              df["end_station_longitude"])
#test
df.sort_values(by="distance_km", ascending=False).head()
df.sort_values(by="distance_km", ascending=True).head()
print("99.9%:", np.percentile(df.distance_km, 99.9))
print("100%:", np.percentile(df.distance_km, 100))
print(df[df["distance_km"] > 10_000].shape)
df[df["distance_km"] > 10_000]
We can see here a recording mistake affecting the computed distance: some end-point latitudes and longitudes are zeros, which inflates their distance. A distance of zero, on the other hand, means the same start and end point, which may also be a round trip. We also noticed some outliers — 99.9% of our data are below 7 km.
# Inspect zero-distance trips (same start and end station, i.e. round trips)
print(df[df.distance_km == 0].shape)
df[df.distance_km == 0].head()
#this is the same starting and ending points
# Trips whose end or start coordinates were recorded as (0, 0).
print(df[df.end_station_latitude == 0].shape)
df[df.end_station_latitude == 0].head()
print(df[df.start_station_latitude == 0].shape)
df[df.start_station_latitude == 0].head()
#code: keep only trips that actually covered some distance
df = df.loc[df.distance_km != 0]
#test
print(df[df.distance_km == 0].shape)
print(df.shape)
df[df.distance_km == 0].head()
Remove outliers.
#code: drop rows whose zeroed-out coordinates produced absurd distances
df = df.loc[df["distance_km"] < 10_000]
#test: nothing at or above the cutoff should remain
print(df[df.distance_km < 10_000].shape)
print(df.shape)
df[df.distance_km > 10_000].head()
Add a day-of-week column based on the start date.
#code
# Map the day-of-week number (Monday=0 ... Sunday=6) straight to its name
# in one vectorized step instead of seven chained replace() calls.
days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
df["day"] = df.start_time.dt.dayofweek.map(dict(enumerate(days)))
#test
df[["start_time", "day"]].sample(10)
df.head(3)
# show_counts replaces the null_counts keyword removed in pandas 2.0.
df.info(verbose=True, show_counts=True)
# Reorder the columns for readability and persist the cleaned data set.
df = df[["duration_min", "day", "start_time", "end_time", "distance_km",
         "start_station_id", "start_station_name", "start_station_latitude",
         "start_station_longitude", "end_station_id", "end_station_name",
         "end_station_latitude", "end_station_longitude", "user_type",
         "bike_share_for_all_trip"]]
df.to_csv("cleaned_combined.csv", index=False)
The bike_share_for_all_trip column tracks members who are enrolled in the Bike Share for All program for low-income residents.
(Subscriber or Customer – “Subscriber” = Member or “Customer” = Casual)
# df=df[["duration_min","day","start_time","end_time","distance_km",\
# "start_station_id","start_station_name","start_station_latitude","start_station_longitude"\
# ,"end_station_id","end_station_name","end_station_latitude","end_station_longitude","user_type","bike_share_for_all_trip"]]
# df.to_csv("combined_tripdata.csv",index=False)
# df=pd.read_csv("combined_tripdata.csv")
# df.head()
# Create a figure with two pie-chart axes side by side.
fig, (ax1, ax2) = plt.subplots(figsize=(12, 5), ncols=2)
colors = ["#BDD2A6", "#C5E6A6"]
fig.suptitle('Portion of bike users in 2019', fontsize=14, fontweight='bold')
user_type = df.user_type.value_counts()
bike_sh = df.bike_share_for_all_trip.value_counts()
# Shared pie styling for both axes.
pie_kwargs = dict(startangle=90, counterclock=False, wedgeprops={'width': 1},
                  autopct='%1.1f%%', textprops={'fontsize': 14}, colors=colors)
# Left axes: share of Customers vs Subscribers.
_ = ax1.pie(user_type, labels=user_type.index, **pie_kwargs)
_ = ax1.set_title("Portion of customers or subscriber", size=13, fontweight='bold')
_ = ax1.axis('equal')  # equal aspect ratio keeps the pie circular
# Right axes: share of riders enrolled in Bike Share for All.
_ = ax2.pie(bike_sh, labels=bike_sh.index, **pie_kwargs)
_ = ax2.set_title("enrolled in \"Bike Share for All program\"", size=13, fontweight='bold')
_ = ax2.axis('equal')  # equal aspect ratio keeps the pie circular
This pie chart shows the portion of users enrolled in "Bike Share for All Program" and the majority in the dataset are not. Bike Share for All is available to Bay Area residents ages 18 and older who qualify for Calfresh, SFMTA Lifeline Pass, or PG&E CARE utility discount.
This pie chart shows the portion of bike users by whether they're Customers or Subscribers, and it indicates that the majority are subscribed. The reason is probably that single rides start at \$2 for the first 30 minutes, then \$2 per additional 15 minutes for casual users, whereas an annual membership has unlimited 45-minute Classic bike rides and discounted ebikes: free unlocks and \$0.15 per minute (waived if no Classic bikes are available).
Resource: https://www.lyft.com/bikes/bay-wheels/bike-share-for-all
#create the plot
sb.set(rc={'figure.figsize': (12, 5)})
dayofweek = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']
color = sb.color_palette()[0]
with plt.style.context('ggplot'):
    sb.countplot(data=df, x="day", color=color, order=dayofweek)
    plt.title("Most bike rental users in the week.")
    #add annotations: label each bar with its percentage of all rides
    n_points = df.shape[0]
    day_counts = df['day'].value_counts()
    locs, labels = plt.xticks()  # current tick locations and labels
    for loc, label in zip(locs, labels):
        # the tick label text is the day name, which keys into day_counts
        count = day_counts[label.get_text()]
        pct_string = '{:0.1f}%'.format(100 * count / n_points)
        # place the annotation at the top of the bar
        plt.text(loc, count - 8, pct_string, ha='center', color='black',
                 va='bottom', fontsize=12)
The majority of bike rides are on weekdays, mostly Thursday at 17.3%, whereas on Saturday (9%) and Sunday (8.1%) we can see a major drop in bike ride users.
np.percentile(df.duration_min, 100)
(df.duration_min).describe()
from matplotlib.ticker import PercentFormatter
# Weight every trip by 1/N so the bar heights are relative frequencies.
weights = np.ones(df.shape[0]) / df.shape[0]
_ = plt.hist(x=df.duration_min, lw=0, bins=50, range=(0, 60), weights=weights)
_ = plt.axvline(x=df.duration_min.mean(), color='red')  # mark the mean
plt.gca().yaxis.set_major_formatter(PercentFormatter(.1))
# fixed "minuts" typo in the displayed title
_ = plt.title("Frequent duration in minutes for bike users", size=15)
_ = plt.ylabel("Frequency", size=15)
_ = plt.xlabel("Minutes", size=15)
_ = plt.legend(["mean", "minutes"])
Here is a right-skewed distribution of duration in minutes for our data (aka a positively skewed distribution). The majority of bike rides fall between 3 and 7 minutes.
We have a mean of 12.8 and a median of 9.4, which is lower than the mean.
The reason most rides are below 60 minutes is probably that Bay Wheels bike share charges \$3 for each additional 15 minutes on rides longer than 60 minutes.
df.columns
# Confirm the right skew: the mean exceeds the median.
print("mean>median:", np.mean(df.duration_min), ">", np.percentile(df.duration_min, 50))
df[["distance_km"]].describe()
# Relative-frequency histogram of trip distance up to 5 km.
weights = np.ones(df.shape[0]) / df.shape[0]
_ = plt.hist(df.distance_km, bins=np.arange(0, 5, .3), range=None, weights=weights, lw=1)
plt.gca().yaxis.set_major_formatter(PercentFormatter(.2))
_ = plt.axvline(x=df.distance_km.mean(), color='red')  # mark the mean
_ = plt.title("Portion distance in km for bike users", size=15)
_ = plt.ylabel("Frequency", size=15)
_ = plt.xlabel("Distance in km", size=15)
_ = plt.xticks(np.arange(0, 5, .3))
_ = plt.legend(["mean", "distance"])
70% of 2019 bike trips have a distance between 0.6 km and 1.5 km, and 50% are below 1.5 km; only a tiny fraction (about 0.004%) of trips are outliers above 69 km. For distance we have a right-skewed distribution, so the mean is greater than the median. Also notice that the tail of the distribution on the right-hand (positive) side is longer than on the left-hand side.
print("mean>median:", np.mean(df.distance_km), ">", np.percentile(df.distance_km, 50))
# df.to_csv("cleaned_combined.csv",index=False)
# Compute the Pearson correlation matrix directly — the full df.copy() the
# original made before calling .corr() was an unnecessary deep copy.
pearsoncorr = df.corr(method='pearson')
plt.figure(figsize=(15, 8))  # create a figure object
_ = sb.heatmap(pearsoncorr, cmap='RdBu_r',
               annot=True, annot_kws={'size': 12},
               linewidth=0.5, square=True)  # visualization
plt.xticks(size=15); plt.yticks(size=15)  # x & y tick label size
_ = plt.title("calculating the Pearson coefficient of correlation", size=17)  # title
There is a high correlation between start station latitude and end station latitude.
from matplotlib.colors import LogNorm
# 2d histogram: start station latitude (x) vs end station latitude (y).
fig, ax = plt.subplots()
h = ax.hist2d(x=df.start_station_latitude, y=df.end_station_latitude,
              cmap=plt.cm.RdYlBu_r)
# h[3] is the QuadMesh image; it carries the colormap and limits the
# colorbar needs.
plt.colorbar(mappable=h[3], ax=ax)
plt.title("Correlation between start_station_latitude and end_station_latitude ")
plt.ylabel("end_station_latitude")
plt.xlabel("start_station_latitude")
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(16, 12))  # 2x2 grid of axes
# (row, col) index for each of the four subplots.
inter = list(zip([0, 1, 0, 1], [0, 0, 1, 1]))
# Colour for every day of the week (same palette as the original).
day_colors = {'Monday': '#CB997E', 'Tuesday': '#A5A58D', 'Wednesday': '#DDBEA9',
              'Thursday': '#D1553B', 'Friday': '#BFEDC1', 'Saturday': '#8B0000',
              'Sunday': '#826754'}
'''Each subplot shows a fresh random 10k sample of day vs trip duration.
One scatter call per day instead of one per sampled row: the same points
and colours are drawn, with thousands of times fewer artist objects.'''
for j in range(len(inter)):
    sam = df.sample(10_000)  # fresh 10k sample for this subplot
    for day_name, day_color in day_colors.items():
        subset = sam[sam['day'] == day_name]
        ax[inter[j]].scatter(subset['day'], subset['duration_min'],
                             color=day_color, label=day_name)
fig.suptitle("Drawing 10k samples of days and duration minutes", fontsize=15)  # title
fig.text(-.01, .5, 'Minutes', fontsize=15)  # y label position
fig.text(.4, .01, 'Day of the week', fontsize=15)  # x label position
plt.show()  # this will remove unwanted text
plt.figure(figsize=(20, 5))
# Boxplot of trip distance for the five most frequent start stations.
top5 = df.start_station_name.value_counts().index[:5]
sb.boxplot(data=df.loc[df['start_station_name'].isin(top5)],
           x='start_station_name',
           y='distance_km',
           color=color)
plt.title('top 5 start station name corresponding to distance in km')
plt.xticks(rotation=90, ha='right')
plt.show()
# Mean distance (with standard-error bars) per start_station_id bin.
bin_size = 0.25
xbin_edges = np.arange(0.6, df.start_station_id.max() + bin_size, bin_size)
xbin_centers = (xbin_edges + bin_size / 2)[:-1]
duration_binned = pd.cut(df.start_station_id, xbin_edges, right=False, include_lowest=True)
grouped = df['distance_km'].groupby(duration_binned)
plt.errorbar(x=xbin_centers, y=grouped.mean(), yerr=grouped.sem())
plt.title("Distance & start station id ")
plt.xlabel("start station id")
plt.ylabel("Distance")
plt.show()
In the 2D histogram above we visualized the variables that are highly correlated, which are start_station_latitude and end_station_latitude.
# Scatter (lmplot without regression) of duration vs distance for the three
# busiest start stations, coloured by user type.
top3 = df.start_station_name.value_counts().index[:3]
_ = sb.lmplot(x='duration_min', y='distance_km', hue='user_type',
              data=df.loc[df['start_station_name'].isin(top3)], fit_reg=False)
_ = plt.yticks(np.arange(0, 7.5, 0.5))
# fixed "corespondent" typo in the displayed title
_ = plt.title("Duration corresponding to distance")
df.start_station_name.value_counts().index[:3]
# Facet by user type and colour by Bike Share for All enrolment; keep only
# distances below 10 km because they are the vast majority of trips.
graph = sb.FacetGrid(df[df["distance_km"] < 10], hue='bike_share_for_all_trip', col='user_type')
graph.map(plt.scatter, 'distance_km', 'duration_min')
graph.add_legend()
plt.show()
from scipy import stats

def quantile_plot(x, **kwargs):
    # Probability plot of x against a theoretical normal distribution;
    # extra kwargs are forwarded to plt.scatter.
    quantiles, ordered = stats.probplot(x, fit=False)
    plt.scatter(ordered, quantiles, **kwargs)

g = sb.FacetGrid(df, col="user_type", height=4)
g.map(quantile_plot, "bike_share_for_all_trip")
plt.xticks([0, 1], ["True", "False"])
plt.ylabel('Frequency')
# Rows for Customers enrolled in Bike Share for All. Comparing the column
# to the literal True avoids the fragile "@True" lookup the original query
# used: query's @ prefix resolves local *variables*, and True is a keyword,
# not a variable.
df.query("user_type=='Customer' and bike_share_for_all_trip == True")
q = df.sample(1_000)
q
# Add a month-number column extracted from the trip start timestamp.
df['month'] = df['start_time'].dt.month
df['start_time'].dt.month
fig = plt.figure(figsize=(12, 6))
# Mean trip duration per month, split by user type.
sb.pointplot(data=df, x='month', y='duration_min', hue='user_type', palette='Reds')
plt.title('Trip duration across user type in different months')  # fixed "duartion" typo
plt.ylabel('Mean duration')  # fixed "Dduration" typo
plt.xlabel('Month')
plt.show()
df.to_csv("cleaned_combined.csv", index=False)
There is not a single user who is a Customer and enrolled in the Bike Share for All program, at least in 2019. Also, when plotting the mean duration across months, we can see that Customers have longer durations.